The objectives of this tutorial are to practise best subset selection, forward stepwise selection, and lasso regression for linear models.
Make sure you have these packages installed:
install.packages(c("tidyverse", "ISLR", "leaps", "patchwork", "rsample", "tidymodels", "glmnet", "skimr"))
Work your way through the textbook lab 6.5.1 Best Subset Selection, and the forward stepwise procedure in 6.5.2, then answer these questions. (Note that there is no tidymodels option for either of these procedures at the moment.)
The regsubsets() function (part of the leaps library) performs best subset selection by identifying the best model that contains a given number of predictors, where best is quantified using RSS. By default it only examines models of up to 8 variables. Which variables make the best 8-variable model?

library(knitr)
# Load packages: tidyverse for wrangling and plotting, ISLR for the
# Hitters data, skimr for data summaries, leaps for subset selection,
# patchwork for composing plots, rsample/tidymodels for data splitting,
# and glmnet for the lasso.
library(tidyverse)
library(ISLR)
library(skimr)
library(leaps)
library(patchwork)
library(rsample)
library(tidymodels)
library(glmnet)
# Baseball players' performance statistics and salaries
data(Hitters)
# Take a look at the data
skim(Hitters)
# Drop the observations with a missing Salary
Hitters <- Hitters %>% drop_na(Salary)

# Best subset selection; by default regsubsets() examines
# models of up to 8 predictors
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full)

# Coefficients of the best 8-variable model
coef(regfit.full, 8)
# (Intercept)       AtBat        Hits       Walks      CHmRun       CRuns
# 130.9691577  -2.1731903   7.3582935   6.0037597   1.2339718   0.9651349
#      CWalks   DivisionW     PutOuts
#  -0.8323788 -117.9657795   0.2908431
Rerun the selection with nvmax=19. Plot the model fit diagnostics for the best model of each size. What would these diagnostics suggest about an appropriate choice of models? Do your results compare with the textbook results? Why not?

regfit.full <- regsubsets(Salary~., data=Hitters, nvmax=19)
reg.summary <- summary(regfit.full)
#names(reg.summary)

# Collect the fit statistics for the best model of each size
models <- tibble(
  nvars = seq_len(19),
  rsq   = reg.summary$rsq,
  rss   = reg.summary$rss,
  adjr2 = reg.summary$adjr2,
  cp    = reg.summary$cp,
  bic   = reg.summary$bic
)

# Helper: line plot of one diagnostic against model size
plot_stat <- function(stat) {
  ggplot(models, aes(x = nvars, y = .data[[stat]])) +
    geom_line() +
    labs(x = "nvars", y = stat)
}

# Compose the five diagnostic plots with patchwork
plot_stat("rsq") + plot_stat("rss") + plot_stat("adjr2") +
  plot_stat("cp") + plot_stat("bic")
# Forward stepwise selection over all 19 predictors
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
#summary(regfit.fwd)

# The rss component of the fitted object also includes the
# intercept-only model, hence sizes 0 through 19
fwd_fit <- tibble(nvars = 0:19, rss = regfit.fwd$rss)
ggplot(fwd_fit, aes(x = nvars, y = rss)) + geom_line()
# Validation-set approach: 2/3 training, 1/3 test
set.seed(1)
split <- initial_split(Hitters, 2/3)
h_tr <- training(split)
h_ts <- testing(split)

# Best subsets fitted on the training data only
regfit.best <- regsubsets(Salary ~ ., data = h_tr, nvmax = 19)

# regsubsets() has no predict method, so build test-set
# predictions by hand from the model matrix and coefficients
test.mat <- model.matrix(Salary ~ ., data = h_ts)
val.errors <- map_dbl(seq_len(19), function(sz) {
  coefs <- coef(regfit.best, id = sz)
  preds <- test.mat[, names(coefs)] %*% coefs
  mean((h_ts$Salary - preds)^2)
})
val.errors
# [1] 190001.1 167091.2 164492.6 206160.8 193087.5 167994.7 169349.4 171537.4
# [9] 169573.8 168444.3 174006.9 177763.9 179017.7 172536.2 165313.8 165693.7
# [17] 168396.0 168291.9 168005.9
# Plot test MSE against model size
val_df <- tibble(nvars = seq_len(19), err = val.errors)
ggplot(val_df, aes(x = nvars, y = err)) + geom_line()

# Coefficients of the size with the lowest validation error
coef(regfit.best, which.min(val.errors))
# (Intercept) Walks CRBI DivisionW
# 128.9273817 6.2366881 0.7743211 -173.5351448

# Compare with the best 10-variable model on the full data
coef(regfit.full, 10)
# (Intercept) AtBat Hits Walks CAtBat CRuns
# 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
# CRBI CWalks DivisionW PutOuts Assists
# 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
It is said that 10-fold cross-validation is a reasonable choice for dividing the data. What size data sets would this create for this data? Argue whether this is good or bad.
Here we will use lasso to fit a regularised regression model and compare the results with the best subset model.
# Smallest validation-set MSE across all model sizes
min(val.errors)
# [1] 164492.6
# The best model by validation error uses only three predictors
coef(regfit.best, which.min(val.errors))
# (Intercept) Walks CRBI DivisionW
# 128.9273817 6.2366881 0.7743211 -173.5351448
# Grid of lambda values from 10^10 down to 10^-2
grid <- 10^seq(10, -2, length = 100)
# Model matrix without the intercept column; glmnet adds its own
x <- model.matrix(Salary ~ ., h_tr)[, -1]
y <- h_tr$Salary
# alpha = 1 selects the lasso penalty
lasso.mod <- glmnet(x, y, alpha = 1, lambda = grid)

# Coefficient paths in long form: one row per (variable, lambda).
# beta is a sparse 19 x 100 matrix; as.matrix() densifies it.
# glmnet names its columns s0, s1, ... in lambda order, so the
# lambda for each row is looked up from the column name rather
# than relying on the row ordering of the reshape (the original
# rep() trick silently depended on gather()'s stacking order).
# pivot_longer() replaces the superseded gather().
cv <- lasso.mod$beta %>%
  as.matrix() %>%
  as_tibble(rownames = "var") %>%
  pivot_longer(-var, names_to = "step", values_to = "coeffs") %>%
  mutate(lambda = lasso.mod$lambda[as.integer(str_remove(step, "s")) + 1])

# lambda is strictly positive, so a negative limit is invalid on a
# log scale (the original limits = c(-1, 100) was dropped with a
# warning); 0.1 = 10^-1 gives the intended log10 range [-1, 2].
p <- ggplot(cv, aes(x = lambda, y = coeffs, group = var, label = var)) +
  geom_line() +
  scale_x_log10(limits = c(0.1, 100))
plotly::ggplotly(p)
# This is how the sample code plots
#plot(lasso.mod, xvar="lambda", xlim=c(-1, 5))
# Cross-validation for lambda, using glmnet's built-in function
set.seed(1)
cv.out <- cv.glmnet(x, y, alpha = 1)
#cv.df <- tibble(lambda=cv.out$lambda, mse=cv.out$cvm,
#  mse_l=cv.out$cvlo, mse_h=cv.out$cvup)
#ggplot(cv.df, aes(x=lambda, y=mse)) + geom_point() +
#  scale_x_log10() +
#  geom_errorbar(aes(ymin=mse_l, ymax=mse_h))
# This is how the sample code plots
plot(cv.out)

# Predict the test set at the CV-chosen lambda and compute MSE
bestlam <- cv.out$lambda.min
bestlam
# [1] 0.8398314
x_ts <- model.matrix(Salary ~ ., h_ts)[, -1]
y_ts <- h_ts$Salary
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x_ts)
mean((lasso.pred - y_ts)^2)
# [1] 152504.4
# Fit the final lasso model on the training data.
# alpha=1 means lasso.
# It's strange that it still needs the grid of lambdas to fit,
# but it seems necessary for the optimisation.
# The coefficients at the best lambda are then extracted with
# the predict() method (type="coefficients").
out <- glmnet(x, y, alpha=1, lambda=grid)
# Rows 1:20 = the intercept plus the 19 predictors
lasso.coef <- predict(out, type="coefficients", s=bestlam)[1:20,]
lasso.coef
# (Intercept) AtBat Hits HmRun Runs
# 209.89878629 -0.64408201 2.52580439 0.48550310 -2.28299815
# RBI Walks Years CAtBat CHits
# 0.33802884 4.69234869 -5.81792236 -0.43786797 1.61886262
# CHmRun CRuns CRBI CWalks LeagueN
# 2.24788459 0.46622326 0.02739373 -0.27061904 125.32172895
# DivisionW PutOuts Assists Errors NewLeagueN
# -140.88654871 0.28622991 0.31107401 -4.24913862 -98.12621758
# Keep only the non-zero coefficients; at this lambda none were
# shrunk exactly to zero, so the list below is unchanged
lasso.coef[lasso.coef!=0]
# (Intercept) AtBat Hits HmRun Runs
# 209.89878629 -0.64408201 2.52580439 0.48550310 -2.28299815
# RBI Walks Years CAtBat CHits
# 0.33802884 4.69234869 -5.81792236 -0.43786797 1.61886262
# CHmRun CRuns CRBI CWalks LeagueN
# 2.24788459 0.46622326 0.02739373 -0.27061904 125.32172895
# DivisionW PutOuts Assists Errors NewLeagueN
# -140.88654871 0.28622991 0.31107401 -4.24913862 -98.12621758
Only one variable is very important for the model — which is it? (It occurs in every list of variables returned as important, and has the highest coefficient in the lasso model.) Several more variables frequently appear as important; which ones are these? Several others appear occasionally in some models but always have very small coefficients. Can you name one of these? What does this tell you about the relationship between Salary and all the predictors?